The settings list settings configure which parts, simulations, and analysis steps of this R markdown are executed. This way single components of the analysis can be disabled for the purpose of saving computation time.
settings <- list(
# Application of the full DANA pipeline to the benchmark data
bench.DANA=T,
# Application of the full DANA pipeline to the test data
test.DANA=T,
# Generate interactive plots of co-expression structures
investigate.coexpression=F,
# Perform Differential Expression Analysis for test and benchmark data sets
perform.DEA=T,
# Generate paper figures
generate.paper.figs=T,
# Specify if paper figures are exported
export.figures = T,
# Path for file exports
paper.fig.path = "../danawriteup/figs/",
# Clears environment
debug=TRUE
)
if(settings$debug) rm(list=ls()[ls()!="settings"])
The config list contains all parameters for the analysis. Remember to always set the working directory and the paths to the data/external files prior to running the code.
config <- list(
DANA.path = "./R/DANA.R",
## Data
case.name = "UCEC",
# Read count data file for the full UCEC data set
data.file.path = "data/TCGA_harmonized_UCEC.RData",
# RData file for coordinates data frame based on miRBase miRNA definitions
coords.file.path = "data/miRBase.coords.RData",
## Normalization Methods
# Specify which normalization methods will be applied
norm.apply.TC = TRUE,
norm.apply.UQ = TRUE,
norm.apply.med = TRUE,
norm.apply.TMM = TRUE,
norm.apply.DESeq = TRUE,
norm.apply.PoissonSeq = TRUE,
norm.apply.QN = TRUE,
norm.apply.RUV = TRUE,
# thresholds for zero-expressed, poorly-expressed and well-expressed genes
t.zero = 2, # zero-expressed in [0, t.zero)
t.poor = 5, # poorly-expressed in [t.zero, t.well]
t.well = 2^6, # well-expressed in [t.well, inf)
# distance threshold for clustering
cluster.threshold = 10000,
# preprocessing data transformation type: none ("n"), modified log2 ("log2"),
# or cube root ("cube") available
preprocess.transform = "log2",
## Correlation Estimation for positive and negative controls
# Available methods | Tuning parameter calibration schemes
# "mb" | "cv", "aic", "bic", "av"
# "huge.mb" | "ric", "stars"
# "glasso" | "ric", "stars", "ebic"
# "fastggm" | none
# "pearson" | none
# "spearman" | none
# Positive Controls
corr.method.pos = "mb",
tuning.criterion.pos = "bic",
# Negative Controls
corr.method.neg = "pearson",
tuning.criterion.neg = "",
# Generate plots within DANA
generate.plots = FALSE # generate comparison plots
)
Configuration for the differential expression analysis
config.DEA <- list(
## Data
case.name = "UCEC",
## Normalization Methods
# Specify which normalization methods will be applied
norm.apply.TC = TRUE,
norm.apply.UQ = TRUE,
norm.apply.med = TRUE,
norm.apply.TMM = TRUE,
norm.apply.DESeq = TRUE,
norm.apply.PoissonSeq = TRUE,
norm.apply.QN = TRUE,
norm.apply.RUV = TRUE,
# Significance level for DEA
alpha = 0.01,
# Plots
generate.volcano.plot = TRUE,
generate.meanvar.plot = TRUE,
# Use q-values (adjusted p-values) instead of p-values
q.values = FALSE,
# RUV reduces the parameter size. Reduce DEA result to RUV genes?
perform.subsetting = FALSE
)
Load all required R libraries/packages.
# DANA functions
source(config$DANA.path)
# Libraries
library(openxlsx) # read excel files
library(corrplot) # For mixed correlation plots
library(cowplot) # Arrange plots
library(RColorBrewer) # Colors
library(latex2exp) # for latex in axis labels
library(ggcorrplot) # ggplot2 correlation plots
We load the data set into memory.
# Load the p x n matrix of read counts into the workspace
load(config$data.file.path)
# Unlist the data
benchmark <- TCGA.UCEC$benchmark
test <- TCGA.UCEC$test
benchmark.end <- benchmark$END
benchmark.ser <- benchmark$SER
test.end <- test$END
test.ser <- test$SER
# Transform gene names to lower case
rownames(benchmark.end) <- tolower(rownames(benchmark.end))
rownames(benchmark.ser) <- tolower(rownames(benchmark.ser))
rownames(test.end) <- tolower(rownames(test.end))
rownames(test.ser) <- tolower(rownames(test.ser))
# Rename
test.RC <- test <- cbind(test.end, test.ser)
test.groups <- c(rep("END", dim(test.end)[2]), rep("SER", dim(test.ser)[2]))
bench.RC <- benchmark <- cbind(benchmark.end, benchmark.ser)
bench.groups <- c(rep("END", dim(benchmark.end)[2]), rep("SER", dim(benchmark.ser)[2]))
# Inspect
cat("Dimensions of test (mixed-batch) data: ", dim(test.RC), "\n")
## Dimensions of test (mixed-batch) data: 1881 48
cat("Dimensions of benchmark (single-batch) data: ", dim(bench.RC), "\n")
## Dimensions of benchmark (single-batch) data: 1881 48
A coordinates data frame that specifies the base pair location of each miRNA of the data set on the chromosomes is loaded.
# Load miRNA chromosome location coordinates data frame
load(config$coords.file.path)
coords <- coords # change according to the name of the loaded data matrix
Some miRNAs map to multiple locations of sequence families. We named the sequence family with a parenthesis that reflects the number of members from all the different genomic locations (e.g. hsa-let-7a(3)). These miRNAs cannot be uniquely mapped to the genome, therefore we must exclude these from the location based analysis.
# only consider genes that are present in the coordinate data frame
genes.in.coords <- rownames(coords)[na.omit(match(rownames(test.RC), rownames(coords)))]
cat(dim(test.RC)[1]-length(genes.in.coords), " genes not found in coords. Reducing data set to ", length(genes.in.coords), "genes.\n")
## 33 genes not found in coords. Reducing data set to 1848 genes.
# test data set
test.RC <- test.RC[genes.in.coords, ]
# benchmark data set
bench.RC <- bench.RC[genes.in.coords, ]
genes <- rownames(test.RC)
rm(genes.in.coords)
First, we investigate the distribution of mean miRNA expression of the data.
# Histogram plot test data set
par(mfrow=c(1,1))
df <- data.frame(log.expression=log2(rowMeans(test.RC)+1))
print(ggplot(df, aes(x=log.expression)) +
geom_histogram(binwidth = .1, color="black", fill="black") +
geom_vline(xintercept = log2(config$t.zero+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.poor+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.well+1), color="red", linetype="dashed") +
ggtitle("mixed-batch data set"))
rm(df)
# Histogram plot benchmark data set
par(mfrow=c(1,1))
df <- data.frame(log.expression=log2(rowMeans(bench.RC)+1))
print(ggplot(df, aes(x=log.expression)) +
geom_histogram(binwidth = .1, color="black", fill="black") +
geom_vline(xintercept = log2(config$t.zero+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.poor+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.well+1), color="red", linetype="dashed") +
ggtitle("single-batch data set"))
rm(df)
We observe that there is a large proportion of poorly-expressed genes. Some of them have extremely low or zero mean expression which corresponds to essentially zero reads.
We apply the full analysis pipeline to the data set. This includes:
The following parameters are used:
if(settings$bench.DANA) {
genes <- rownames(bench.RC)
## Apply Normalization
data.norm <- normalize(X=bench.RC,
groups=bench.groups,
name="benchmark",
apply.TC=config$norm.apply.TC,
apply.UQ=config$norm.apply.UQ,
apply.med=config$norm.apply.med,
apply.TMM=config$norm.apply.TMM,
apply.DESeq=config$norm.apply.DESeq,
apply.PoissonSeq=config$norm.apply.PoissonSeq,
apply.QN=config$norm.apply.QN,
apply.RUV=config$norm.apply.RUV)
bench.norm <- data.norm
## Define Controls
pre.res <- define.controls(bench.RC,
t.zero=config$t.zero,
t.poor=config$t.poor,
t.well=config$t.well,
t.cluster=config$cluster.threshold,
coords=coords,
clusters=NULL)
pos.controls <- pre.res$genes.pos
pos.controls.bench <- pre.res$genes.pos
neg.controls <- pre.res$genes.neg
neg.controls.bench <- pre.res$genes.neg
clusters <- pre.res$clusters
clusters.bench <- clusters
# Apply DANA pipeline
DANA.bench <- DANA(data.RC=bench.RC,
data.norm=bench.norm,
pos.controls=pos.controls.bench,
neg.controls=neg.controls.bench,
clusters=clusters.bench,
coords=coords,
case.name="benchmark",
generate.plots=config$generate.plots,
preprocess.transform=config$preprocess.transform,
corr.method.pos=config$corr.method.pos,
tuning.criterion.pos=config$tuning.criterion.pos,
corr.method.neg=config$corr.method.neg,
tuning.criterion.neg=config$tuning.criterion.neg)
if(!config$generate.plots) {
stargazer(DANA.bench$metrics, type="text", summary=FALSE, digits=4,
title="Comparison metrics for the single-batch data set", align=TRUE)
}
iplot.bench <- iggplot.corr(DANA.bench$data.model$pos$corr, clusters=clusters, title="Single-batch data set - Positive controls (un-normalized)", coords=coords)
iplot.bench.TMM <- iggplot.corr(DANA.bench$norm.models$benchmark.TMM$pos$corr, clusters=clusters, title="Single-batch data set - Positive controls (TMM)", coords=coords)
}
## 973 genes has been filtered because they contains too small number of reads across the experiments.Number of positive control markers with RC in [64, inf): 127
## Number of negative control markers with RC in [2, 5]: 123
## Estimating correlations for pos. and neg. controls for data set benchmark...done
## Estimating correlations for pos. and neg. controls for data set benchmark.TMM...done
## Estimating correlations for pos. and neg. controls for data set benchmark.DESeq...done
## Estimating correlations for pos. and neg. controls for data set benchmark.PoissonSeq...done
## Estimating correlations for pos. and neg. controls for data set benchmark.TC...done
## Estimating correlations for pos. and neg. controls for data set benchmark.Med...done
## Estimating correlations for pos. and neg. controls for data set benchmark.RUVg...done
## Estimating correlations for pos. and neg. controls for data set benchmark.RUVs...done
## Estimating correlations for pos. and neg. controls for data set benchmark.RUVr...done
## Estimating correlations for pos. and neg. controls for data set benchmark.UQ...done
## Estimating correlations for pos. and neg. controls for data set benchmark.QN...done
## Comparing models benchmark vs. benchmark.TMM....done
## Comparing models benchmark vs. benchmark.DESeq....done
## Comparing models benchmark vs. benchmark.PoissonSeq....done
## Comparing models benchmark vs. benchmark.TC....done
## Comparing models benchmark vs. benchmark.Med....done
## Comparing models benchmark vs. benchmark.RUVg....done
## Comparing models benchmark vs. benchmark.RUVs....done
## Comparing models benchmark vs. benchmark.RUVr....done
## Comparing models benchmark vs. benchmark.UQ....done
## Comparing models benchmark vs. benchmark.QN....done
##
## Comparison metrics for the single-batch data set
## ===================================
## cc mscr
## -----------------------------------
## benchmark.TMM 0.9844 0.1153
## benchmark.DESeq 0.9938 0.1069
## benchmark.PoissonSeq 0.9942 0.1499
## benchmark.TC 0.9821 -0.2690
## benchmark.Med 0.9136 0.1675
## benchmark.RUVg 0.9832 0.0557
## benchmark.RUVs 0.9410 0.0895
## benchmark.RUVr 0.9702 0.1971
## benchmark.UQ 0.9239 0.1283
## benchmark.QN 0.9333 0.1558
## -----------------------------------
if(settings$test.DANA) {
genes <- rownames(test.RC)
## Apply Normalization
data.norm <- normalize(test.RC,
groups=test.groups,
name="test",
apply.TC=config$norm.apply.TC,
apply.UQ=config$norm.apply.UQ,
apply.med=config$norm.apply.med,
apply.TMM=config$norm.apply.TMM,
apply.DESeq=config$norm.apply.DESeq,
apply.PoissonSeq=config$norm.apply.PoissonSeq,
apply.QN=config$norm.apply.QN,
apply.RUV=config$norm.apply.RUV)
test.norm <- data.norm
# Define Controls
pre.res <- define.controls(test.RC,
t.zero=config$t.zero,
t.poor=config$t.poor,
t.well=config$t.well,
t.cluster=config$cluster.threshold,
coords=coords,
clusters=NULL)
pos.controls <- pre.res$genes.pos
pos.controls.test <- pre.res$genes.pos
neg.controls <- pre.res$genes.neg
neg.controls.test <- pre.res$genes.neg
clusters <- pre.res$clusters
clusters.test <- clusters
# Apply DANA pipeline
DANA.test <- DANA(data.RC=test.RC,
data.norm=test.norm,
pos.controls=pos.controls.test,
neg.controls=neg.controls.test,
clusters=clusters.test,
coords=coords,
case.name="test",
generate.plots=config$generate.plots,
preprocess.transform=config$preprocess.transform,
corr.method.pos=config$corr.method.pos,
tuning.criterion.pos=config$tuning.criterion.pos,
corr.method.neg=config$corr.method.neg,
tuning.criterion.neg=config$tuning.criterion.neg)
if(!config$generate.plots) {
stargazer(DANA.test$metrics, type="text", summary=FALSE, digits=4,
title="Comparison metrics for the mixed-batch data set", align=TRUE)
}
iplot.test <- iggplot.corr(DANA.test$data.model$pos$corr, clusters=clusters, title="Mixed-batch data set - Positive controls (un-normalized)", coords=coords)
iplot.test.TMM <- iggplot.corr(DANA.test$norm.models$test.TMM$pos$corr, clusters=clusters, title="Mixed-batch data set - Positive controls (TMM)", coords=coords)
}
## 1013 genes has been filtered because they contains too small number of reads across the experiments.Number of positive control markers with RC in [64, inf): 110
## Number of negative control markers with RC in [2, 5]: 112
## Estimating correlations for pos. and neg. controls for data set test...done
## Estimating correlations for pos. and neg. controls for data set test.TMM...done
## Estimating correlations for pos. and neg. controls for data set test.DESeq...done
## Estimating correlations for pos. and neg. controls for data set test.PoissonSeq...done
## Estimating correlations for pos. and neg. controls for data set test.TC...done
## Estimating correlations for pos. and neg. controls for data set test.Med...done
## Estimating correlations for pos. and neg. controls for data set test.RUVg...done
## Estimating correlations for pos. and neg. controls for data set test.RUVs...done
## Estimating correlations for pos. and neg. controls for data set test.RUVr...done
## Estimating correlations for pos. and neg. controls for data set test.UQ...done
## Estimating correlations for pos. and neg. controls for data set test.QN...done
## Comparing models test vs. test.TMM....done
## Comparing models test vs. test.DESeq....done
## Comparing models test vs. test.PoissonSeq....done
## Comparing models test vs. test.TC....done
## Comparing models test vs. test.Med....done
## Comparing models test vs. test.RUVg....done
## Comparing models test vs. test.RUVs....done
## Comparing models test vs. test.RUVr....done
## Comparing models test vs. test.UQ....done
## Comparing models test vs. test.QN....done
##
## Comparison metrics for the mixed-batch data set
## =============================
## cc mscr
## -----------------------------
## test.TMM 0.9847 0.4744
## test.DESeq 0.9884 0.4338
## test.PoissonSeq 0.9871 0.4759
## test.TC 0.9799 0.0732
## test.Med 0.9878 0.2762
## test.RUVg 0.9800 0.4220
## test.RUVs 0.8663 0.4412
## test.RUVr 0.9703 0.4864
## test.UQ 0.9928 0.2458
## test.QN 0.9460 0.4832
## -----------------------------
We compare the estimated co-expression structures for the mixed-batch and single-batch dataset. Here, we take a look at the models for the un-normalized data as well as TMM-normalized data. Note that you cannot directly compare the graphs of benchmark with test data since the considered set of genes is not identical.
if(settings$investigate.coexpression && settings$bench.DANA && settings$test.DANA) {
htmltools::tagList(list(iplot.bench, iplot.bench.TMM, iplot.test, iplot.test.TMM)) # show plots in markdown
}
if(settings$perform.DEA) {
## Reset data
test.RC <- test
bench.RC <- benchmark
## Normalize test Data
test.norm <- normalize(test.RC,
groups=test.groups,
name="test",
apply.TC=config.DEA$norm.apply.TC,
apply.UQ=config.DEA$norm.apply.UQ,
apply.med=config.DEA$norm.apply.med,
apply.TMM=config.DEA$norm.apply.TMM,
apply.DESeq=config.DEA$norm.apply.DESeq,
apply.PoissonSeq=config.DEA$norm.apply.PoissonSeq,
apply.QN=config.DEA$norm.apply.QN,
apply.RUV=FALSE)
test.norm.RUV.adjust <- list(test.RUVr=norm.RUV(test.RC, test.groups, "RUVr")$adjust.factor,
test.RUVs=norm.RUV(test.RC, test.groups, "RUVs")$adjust.factor,
test.RUVg=norm.RUV(test.RC, test.groups, "RUVg")$adjust.factor)
}
## 1038 genes has been filtered because they contains too small number of reads across the experiments.
if(settings$perform.DEA) {
## Perform DEA on benchmark dataset
bench.DEA <- DE.voom(data.RC=bench.RC, groups=bench.groups, plot=config.DEA$generate.meanvar.plot)
if(config.DEA$generate.volcano.plot) plot.DE.volcano(bench.DEA, alpha=config.DEA$alpha, q.values=config.DEA$q.values)
}
## number of DE genes: 48
if(settings$perform.DEA) {
## Perform DEA on full dataset
test.DEA <- DE.voom(data.RC=test.RC, groups=test.groups, plot=config.DEA$generate.meanvar.plot, plot.title="test (no norm)")
if(config.DEA$generate.volcano.plot) plot.DE.volcano(test.DEA, alpha=config.DEA$alpha, q.values=config.DEA$q.values, title="test (no norm)")
## DEA for normalized test data
test.norm.DEA <- list()
for(i in 1:length(test.norm)) {
test.norm.DEA <-
append(test.norm.DEA, list(DE.voom(data.RC=test.norm[[i]],
groups=test.groups,
plot=config.DEA$generate.meanvar.plot,
plot.title=names(test.norm)[i])))
if(config.DEA$generate.volcano.plot) {
plot.DE.volcano(test.norm.DEA[[i]],
alpha=config.DEA$alpha,
q.values=config.DEA$q.values,
title=names(test.norm)[i])
}
}
## DEA for RUV normalization
for(i in 1:length(test.norm.RUV.adjust)) {
test.norm.DEA <-
append(test.norm.DEA, list(DE.voom(data.RC=test.RC,
groups=test.groups,
plot=config.DEA$generate.meanvar.plot,
plot.title=names(test.norm.RUV.adjust)[i],
adjust=test.norm.RUV.adjust[[i]])))
if(config.DEA$generate.volcano.plot) {
plot.DE.volcano(test.norm.DEA[[i]],
alpha=config.DEA$alpha,
q.values=config.DEA$q.values,
title=names(test.norm.RUV.adjust)[i])
}
}
names(test.norm.DEA) <- c(names(test.norm), names(test.norm.RUV.adjust))
}
## number of DE genes: 75
## number of DE genes: 24
## number of DE genes: 34
## number of DE genes: 31
## number of DE genes: 27
## number of DE genes: 33
## number of DE genes: 38
## number of DE genes: 38
## number of DE genes: 24
## number of DE genes: 34
## number of DE genes: 31
if(settings$perform.DEA) {
## Compare test (non-norm) to benchmark
test.norm.DEA.stats <- list(test=compare.DEAs(DEA=test.DEA,
truth.DEA=bench.DEA,
alpha=config.DEA$alpha))
## Compare normalized test data to benchmark
for(i in 1:length(test.norm.DEA)) {
test.norm.DEA.stats <-
append(test.norm.DEA.stats,
list(compare.DEAs(DEA=test.norm.DEA[[i]],
truth.DEA=bench.DEA,
alpha=config.DEA$alpha,
subset=switch(config.DEA$perform.subsetting+1,
NULL,
rownames(test.norm.DEA[[i]])))))
}
names(test.norm.DEA.stats) <- c("test.No Norm", names(test.norm.DEA))
test.norm.DEA.stats <- test.norm.DEA.stats[1:length(test.norm.DEA.stats)]
## Visualize results
# FNR-FDR Plot
test.DEA.res <- data.frame(method=substr(names(test.norm.DEA.stats), 6, 20),
FDR=-sapply(test.norm.DEA.stats, function(x) x$FDR)+1,
FNR=-sapply(test.norm.DEA.stats, function(x) x$FNR)+1)
p.test.DEA.FNR.FDR <- ggplot(test.DEA.res, aes(x=FNR, y=FDR, label=method)) +
theme_bw() +
geom_point(alpha=1) +
xlab("Sensitivity") + # True positive rate(1-FNR)
ylab("Precision") + # Positive predictive value (1-FDR)
geom_text_repel(aes(label = method), size=3) +
scale_x_continuous(labels = scales::percent, limits=c(0,1),
breaks = scales::pretty_breaks(n = 4)) +
scale_y_continuous(labels = scales::percent, limits = c(0,1))
print(p.test.DEA.FNR.FDR)
# TPR-FPR Plot
test.DEA.res <- data.frame(method=substr(names(test.norm.DEA.stats), 6, 20),
TPR=sapply(test.norm.DEA.stats, function(x) x$TPR),
FPR=-sapply(test.norm.DEA.stats, function(x) x$FPR)+1)
p.test.DEA.TPR.FPR <- ggplot(test.DEA.res, aes(x=TPR, y=FPR, label=method)) +
theme_bw() +
geom_point(alpha=1) +
xlab("TPR") +
ylab("1-FPR") +
geom_text_repel(aes(label = method), size=3) +
scale_x_continuous(labels = scales::percent, limits=c(0,1),
breaks = scales::pretty_breaks(n = 4)) +
scale_y_continuous(labels = scales::percent, limits = c(0,1))
print(p.test.DEA.TPR.FPR)
# CAT Plot
d <- append(list("test.No Norm"=test.DEA), test.norm.DEA)
names(d) <- substr(names(d), 6, 20)
p.test.DEA.CAT <- plot.CAT(DEA=d,
truth.DEA=bench.DEA,
title="Significance ranks of miRNAs",
maxrank=50)
print(p.test.DEA.CAT)
rm(d)
}
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
if(settings$generate.paper.figs) {
# Dimension of data after analysis
cat("Mixed-batch data:\n")
cat(" - Dimensions: p=", dim(test.RC)[1],", n=", dim(test.RC)[2], "\n", sep="")
cat(" - Positive controls:\n")
cat(" * Definition mean RC in interval: [", config$t.well, ", inf ]\n", sep="")
cat(" * Number of positive controls miRNAs:", length(pos.controls.test), "\n")
cat(" - Negative controls:\n")
cat(" * Definition mean RC in interval: [", config$t.zero, ",", config$t.poor, "]\n")
cat(" * Number of negative controls miRNAs:", length(neg.controls.test), "\n")
cat("\n\nSingle-batch data:\n")
cat(" - Dimensions: p=", dim(bench.RC)[1],", n=", dim(bench.RC)[2], "\n", sep="")
cat(" - Positive controls:\n")
cat(" * Definition mean RC in interval: [", config$t.well, ", inf ]\n", sep="")
cat(" * Number of positive controls miRNAs:", length(pos.controls.bench), "\n")
cat(" - Negative controls:\n")
cat(" * Definition mean RC in interval: [", config$t.zero, ",", config$t.poor, "]\n")
cat(" * Number of negative controls miRNAs:", length(neg.controls.bench), "\n")
}
## Mixed-batch data:
## - Dimensions: p=1881, n=48
## - Positive controls:
## * Definition mean RC in interval: [64, inf ]
## * Number of positive controls miRNAs: 110
## - Negative controls:
## * Definition mean RC in interval: [ 2 , 5 ]
## * Number of negative controls miRNAs: 112
##
##
## Single-batch data:
## - Dimensions: p=1881, n=48
## - Positive controls:
## * Definition mean RC in interval: [64, inf ]
## * Number of positive controls miRNAs: 127
## - Negative controls:
## * Definition mean RC in interval: [ 2 , 5 ]
## * Number of negative controls miRNAs: 123
if(settings$generate.paper.figs) {
par(mfrow=c(1,1))
# Histogram plot test data set
test.RC.log2 <- log2(rowMeans(test.RC)+1)
df <- data.frame(log.expression=test.RC.log2)
p.test.RC.hist <- ggplot(df, aes(x=log.expression)) +
geom_histogram(binwidth = .1, color="black", fill="black") +
geom_vline(xintercept = log2(config$t.zero+1), color="#3051ab", linetype="twodash") +
geom_vline(xintercept = log2(config$t.poor+1), color="#3051ab", linetype="twodash") +
geom_vline(xintercept = log2(config$t.well+1), color="9e086c", linetype="longdash") +
ggtitle("Test data set") +
theme_classic()
print(p.test.RC.hist)
rm(df)
# Histogram plot benchmark data set
bench.RC.log2 <- log2(rowMeans(bench.RC)+1)
df <- data.frame(log.expression=bench.RC.log2)
p.bench.RC.hist <- ggplot(df, aes(x=log.expression)) +
geom_histogram(binwidth = .1, color="black", fill="black") +
geom_vline(xintercept = log2(config$t.zero+1), color="#3051ab", linetype="twodash") +
geom_vline(xintercept = log2(config$t.poor+1), color="#3051ab", linetype="twodash") +
geom_vline(xintercept = log2(config$t.well+1), color="#9e086c", linetype="longdash") +
ggtitle("Benchmark data set") +
theme_classic()
print(p.bench.RC.hist)
rm(df)
# Combined read count histogram for test and benchmark data
df <- data.frame(bench=bench.RC.log2, test=test.RC.log2)
p.RC.hist <- ggplot(df) +
theme_classic() +
geom_histogram(aes(x=test, y=..count..,fill="#69b3a2"), binwidth = .1) +
geom_histogram(aes(x=bench, y=-..count.., fill="#404080"), binwidth = .1) +
geom_vline(xintercept = log2(config$t.zero+1), color="#5851b8", linetype="twodash") +
geom_vline(xintercept = log2(config$t.poor+1), color="#5851b8", linetype="twodash") +
geom_vline(xintercept = log2(config$t.well+1), color="#E7298A", linetype="longdash") +
geom_segment(aes(x = log2(config$t.zero+1), y = 180, xend = log2(config$t.poor+1), yend = 180),
arrow=arrow(length=unit(.07, "in"), ends="both"),
color="#5851b8") +
annotate(geom="label", x=(log2(config$t.zero+1)+log2(config$t.poor+1))/2.0, y=200,
label="neg. contr.", color="#5851b8", size=4, fill = alpha(c("white"),0.85), label.size = NA) +
geom_segment(aes(x = log2(config$t.well+1), y = 180, xend = 10, yend = 180),
arrow=arrow(length=unit(.07, "in"), ends="last"),
color="#E7298A") +
annotate(geom="label", x=log2(config$t.well+1)+.5, y=200,
label="pos. contr.", color="#E7298A", size=4, hjust=0, fill = alpha(c("white"),0.85), label.size = NA) +
scale_fill_manual(name="Data set",values=c("#404080", "#69b3a2"), labels=c("single-batch", "mixed-batch"), guide = guide_legend(reverse=TRUE)) +
theme(legend.position=c(0.86,0.86),
legend.background=element_rect(fill=alpha('white', 0.9))) +
scale_y_continuous(breaks=c(-200,-150,-100,-50,0,50,100,150,200), labels=c(200,150,100,50,0,50,100,150,200), limits=c(-200,200)) +
xlab("Mean log2(read count)") +
ylab("Frequency")
print(p.RC.hist)
rm(df)
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_RC_Distribution.pdf"), p.RC.hist, width = 5, height=4, device="pdf")
}
}
## Warning: Removed 1 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_bar).
if(settings$generate.paper.figs) {
par(mfrow=c(1,1))
## log2(mean(RC))
# Histogram plot test data set
test.RC.log2 <- log2(rowMeans(test.RC)+1)
bench.RC.log2 <- log2(rowMeans(bench.RC)+1)
df <- data.frame(test=test.RC.log2, benchmark=bench.RC.log2)
p.RC.comparison.1 <- ggplot(df,aes(x=test,y=benchmark)) +
geom_point(alpha=.25) +
xlab("mixed-batch") +
ylab("single-batch") +
ggtitle("log2(mean(read count) + 1)") +
geom_abline(intercept = 0, slope = 1, color="black", linetype="longdash", size=.5) +
theme(aspect.ratio = 1) +
theme_classic()
print(p.RC.comparison.1)
rm(df,test.RC.log2, bench.RC.log2)
# Histogram plot test data set (END only)
test.RC.log2 <- log2(rowMeans(test.RC[, test.groups=="END"])+1)
bench.RC.log2 <- log2(rowMeans(bench.RC[, bench.groups=="END"])+1)
df <- data.frame(test=test.RC.log2, benchmark=bench.RC.log2)
p.RC.comparison.END.1 <- ggplot(df,aes(x=test,y=benchmark)) +
geom_point(alpha=.25) +
xlab("mixed-batch (END)") +
ylab("single-batch (END)") +
ggtitle(" END --- log2(mean(read count) + 1)") +
geom_abline(intercept = 0, slope = 1, color="black", linetype="longdash", size=.5) +
theme(aspect.ratio = 1) +
theme_classic()
print(p.RC.comparison.END.1)
rm(df,test.RC.log2, bench.RC.log2)
# Histogram plot test data set (SER only)
test.RC.log2 <- log2(rowMeans(test.RC[, test.groups=="SER"])+1)
bench.RC.log2 <- log2(rowMeans(bench.RC[, bench.groups=="SER"])+1)
df <- data.frame(test=test.RC.log2, benchmark=bench.RC.log2)
p.RC.comparison.SER.1 <- ggplot(df,aes(x=test,y=benchmark)) +
geom_point(alpha=.25) +
xlab("mixed-batch (SER)") +
ylab("single-batch (SER)") +
ggtitle(" SER --- log2(mean(read count) + 1)") +
geom_abline(intercept = 0, slope = 1, color="black", linetype="longdash", size=.5) +
theme(aspect.ratio = 1) +
theme_classic()
print(p.RC.comparison.SER.1)
rm(df,test.RC.log2, bench.RC.log2)
## log2(mean(RC))
# Histogram plot test data set
test.RC.log2 <- rowMeans(log2(test.RC+1))
bench.RC.log2 <- rowMeans(log2(bench.RC+1))
df <- data.frame(test=test.RC.log2, benchmark=bench.RC.log2)
p.RC.comparison.2 <- ggplot(df,aes(x=test,y=benchmark)) +
geom_point(alpha=.25) +
xlab("mixed-batch") +
ylab("single-batch") +
ggtitle("mean(log2(read count + 1))") +
geom_abline(intercept = 0, slope = 1, color="black", linetype="longdash", size=.5) +
theme(aspect.ratio = 1) +
theme_classic()
print(p.RC.comparison.2)
rm(df,test.RC.log2, bench.RC.log2)
# Histogram plot test data set (END only)
test.RC.log2 <- rowMeans(log2(test.RC[, test.groups=="END"]+1))
bench.RC.log2 <- rowMeans(log2(bench.RC[, bench.groups=="END"]+1))
df <- data.frame(test=test.RC.log2, benchmark=bench.RC.log2)
p.RC.comparison.END.2 <- ggplot(df,aes(x=test,y=benchmark)) +
geom_point(alpha=.25) +
xlab("mixed-batch (END)") +
ylab("single-batch (END)") +
ggtitle(" END --- mean(log2(read count + 1))") +
geom_abline(intercept = 0, slope = 1, color="black", linetype="longdash", size=.5) +
theme(aspect.ratio = 1) +
theme_classic()
print(p.RC.comparison.END.2)
rm(df,test.RC.log2, bench.RC.log2)
# Histogram plot test data set (SER only)
test.RC.log2 <- rowMeans(log2(test.RC[, test.groups=="SER"]+1))
bench.RC.log2 <- rowMeans(log2(bench.RC[, bench.groups=="SER"]+1))
df <- data.frame(test=test.RC.log2, benchmark=bench.RC.log2)
p.RC.comparison.SER.2 <- ggplot(df,aes(x=test,y=benchmark)) +
geom_point(alpha=.25) +
xlab("mixed-batch (SER)") +
ylab("single-batch (SER)") +
ggtitle(" SER --- mean(log2(read count + 1))") +
geom_abline(intercept = 0, slope = 1, color="black", linetype="longdash", size=.5) +
theme(aspect.ratio = 1) +
theme_classic()
print(p.RC.comparison.SER.2)
rm(df,test.RC.log2, bench.RC.log2)
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_RC_Comparison_all.pdf"), p.RC.comparison.2, width = 5, height=5, device="pdf")
ggsave(paste0(settings$paper.fig.path, "UCEC_RC_Comparison_END.pdf"), p.RC.comparison.END.2, width = 5, height=5, device="pdf")
ggsave(paste0(settings$paper.fig.path, "UCEC_RC_Comparison_SER.pdf"), p.RC.comparison.SER.2, width = 5, height=5, device="pdf")
}
}
if(settings$generate.paper.figs) {
# Function for mean-variance plot for a given data set
mean.variance.plot <- function(data.RC, title, axis.limit=NULL) {
df <- data.frame(data.mean=log10(rowMeans(data.RC) + 1),
data.var =log10(rowVars(data.RC) + 1))
lin.fit <- lm(df$data.var ~ df$data.mean)$coefficients
p <- ggplot(df,aes(x=data.mean,y=data.var)) +
geom_point(alpha=.25) +
xlab("log10(Mean)") +
ylab("log10(Variance)") +
geom_abline(intercept = lin.fit[1], slope = lin.fit[2], color="red", linetype="longdash", size=1) +
geom_abline(intercept = 0, slope = 1, colour="blue") +
annotate("text", alpha=1, x=4, y=0.5, hjust=0, vjust=0, size=3, colour="red",
label=paste0("log10(Variance) = ", round(lin.fit[1],2), " + ", round(lin.fit[2],2), "*log10(Mean)")) +
xlim(0, ifelse(is.null(axis.limit), max(df), axis.limit)) +
ylim(0, ifelse(is.null(axis.limit), max(df), axis.limit)) +
ggtitle(title) +
theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5)) +
theme_classic()
return(p)
}
## Test data
# END and SER
p.meanvar.test <- mean.variance.plot(test.RC, "Mixed-batch Data", axis.limit=12.5)
print(p.meanvar.test)
# END (endometrioid)
p.meanvar.test.END <- mean.variance.plot(test.RC[, test.groups == "END"], "Mixed-batch Data - END (endometrioid)", axis.limit=12.5)
print(p.meanvar.test.END)
# SER (serous)
p.meanvar.test.SER <- mean.variance.plot(test.RC[, test.groups == "SER"], "Mixed-batch Data - SER (serous)", axis.limit=12.5)
print(p.meanvar.test.SER)
## Benchmark data
# END and SER
p.meanvar.bench <- mean.variance.plot(bench.RC, "Single-batch Data", axis.limit=12.5)
print(p.meanvar.bench)
# END
p.meanvar.bench.END <- mean.variance.plot(bench.RC[, bench.groups == "END"], "Single-batch Data - END (endometrioid)", axis.limit=12.5)
print(p.meanvar.bench.END)
# SER
p.meanvar.bench.SER <- mean.variance.plot(bench.RC[, bench.groups == "SER"], "Single-batch Data - SER (serous)", axis.limit=12.5)
print(p.meanvar.bench.SER)
}
if(settings$generate.paper.figs) {
## Mixed-batch data
# END and SER
p.meanvar.test <- plot.mean.sd(test.RC, config$t.zero, config$t.poor, config$t.well)
print(p.meanvar.test)
# END (endometrioid)
p.meanvar.test.END <- plot.mean.sd(test.RC[, test.groups == "END"], config$t.zero, config$t.poor, config$t.well, "Mixed-batch Data - END (endometrioid)")
print(p.meanvar.test.END)
# SER
p.meanvar.test.SER <- plot.mean.sd(test.RC[, test.groups == "SER"], config$t.zero, config$t.poor, config$t.well, "Mixed-batch Data - SER (serous)")
print(p.meanvar.test.SER)
## Single-Batch data
# END and SER
p.meanvar.bench <- plot.mean.sd(bench.RC, config$t.zero, config$t.poor, config$t.well, "Single-Batch Data")
print(p.meanvar.bench)
# END (endometrioid)
p.meanvar.bench.END <- plot.mean.sd(bench.RC[, bench.groups == "END"], config$t.zero, config$t.poor, config$t.well, "Single-Batch Data - END (endometrioid)")
print(p.meanvar.bench.END)
# SER
p.meanvar.bench.SER <- plot.mean.sd(bench.RC[, bench.groups == "SER"], config$t.zero, config$t.poor, config$t.well, "Single-Batch Data - SER (serous)")
print(p.meanvar.bench.SER)
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_Test_MeanSD.pdf"), p.meanvar.test + labs(title = NULL), width = 5, height=4, device="pdf")
}
}
if(settings$generate.paper.figs) {
num.genes.plot.pos <- 60
num.genes.plot.neg <- 60
# Co-expression models
bench.model <- DANA.bench$data.model
test.model <- DANA.test$data.model
test.norm.model <- DANA.test$norm.models$test.TMM
# Common control miRNAs
pos.controls.bench <- rownames(bench.model$pos$corr)
pos.controls.test <- rownames(test.model$pos$corr)
pos.controls.common <- intersect(pos.controls.test, pos.controls.bench)
neg.controls.bench <- rownames(bench.model$neg$corr)
neg.controls.test <- rownames(test.model$neg$corr)
# reduce the number of genes for the plot
num.genes.plot.pos <- min(num.genes.plot.pos, length(pos.controls.common))
pos.controls.plot <- pos.controls.common[1:num.genes.plot.pos]
num.genes.plot.neg <- min(num.genes.plot.neg,
dim(test.model$neg$corr)[1],
dim(bench.model$neg$corr)[1])
# Subsetted correlation matrices
corr.bench.pos <- bench.model$pos$corr[pos.controls.plot, pos.controls.plot]
corr.test.pos <- test.model$pos$corr[pos.controls.plot, pos.controls.plot]
corr.test.TMM.pos <- test.norm.model$pos$corr[pos.controls.plot, pos.controls.plot]
corr.bench.neg <- bench.model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
corr.test.neg <- test.model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
corr.test.TMM.neg <- test.norm.model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
### Generate plots
# Positive controls
p.corr.pos.bench <- ggplot.corr(corr.bench.pos,
clusters=clusters,
threshold=0,
title="Single-batch (un-normalized)",
coords=coords)
p.corr.pos.test <- ggplot.corr(corr.test.pos,
clusters=clusters,
threshold=0,
title="Mixed-batch (un-normalized)",
coords=coords)
p.corr.pos.test.TMM <- ggplot.corr(corr.test.TMM.pos,
clusters=clusters,
threshold=0,
title="Mixed-batch (TMM)",
coords=coords)
p.corr.pos <- arrangeGrob(p.corr.pos.bench + theme(legend.position="none"),
p.corr.pos.test + theme(legend.position="none"),
get.legend(p.corr.pos.bench),
widths = c(2,2,1))
grid.arrange(p.corr.pos) # display plot
p.corr.pos.three <- arrangeGrob(p.corr.pos.bench + theme(legend.position="none"),
p.corr.pos.test.TMM + theme(legend.position="none"),
p.corr.pos.test + theme(legend.position="none"),
get.legend(p.corr.pos.bench + theme(legend.position = "bottom")),
layout_matrix=rbind(c(1,2,3), c(4,4,4)),
heights = c(5,1))
grid.arrange(p.corr.pos.three) # display plot
# Negative controls
p.corr.neg.bench <- ggplot.corr(corr.bench.neg,
clusters=clusters,
threshold=0,
title="Single-batch (un-normalized)")
p.corr.neg.test <- ggplot.corr(corr.test.neg,
clusters=clusters,
threshold=0,
title="Mixed-batch (un-normalized)")
p.corr.neg.test.TMM <- ggplot.corr(corr.test.TMM.neg,
clusters=clusters,
threshold=0,
title="Mixed-batch (TMM)")
p.corr.neg <- arrangeGrob(p.corr.neg.bench + theme(legend.position="none"),
p.corr.neg.test + theme(legend.position="none"),
get.legend(p.corr.neg.bench),
widths = c(2,2,1))
grid.arrange(p.corr.neg) # display plot
p.corr.neg.three <- arrangeGrob(p.corr.neg.bench + theme(legend.position="none"),
p.corr.neg.test.TMM + theme(legend.position="none"),
p.corr.neg.test + theme(legend.position="none"),
get.legend(p.corr.neg.bench + theme(legend.position = "bottom")),
layout_matrix=rbind(c(1,1,2,2,3,3), c(4,4,4,4,4,4)),
heights = c(5,1))
grid.arrange(p.corr.neg.three) # display plot
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_CorrPos.pdf"), p.corr.pos, width = 8, height=3.5, device="pdf")
ggsave(paste0(settings$paper.fig.path, "UCEC_CorrNeg.pdf"), p.corr.neg, width = 8, height=3.5, device="pdf")
ggsave(paste0(settings$paper.fig.path, "UCEC_CorrPos_TMM.pdf"), p.corr.pos.three, width = 8, height=3.5, device="pdf")
ggsave(paste0(settings$paper.fig.path, "UCEC_CorrNeg_TMM.pdf"), p.corr.neg.three, width = 8, height=3.5, device="pdf")
}
}
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
if(settings$generate.paper.figs) {
p.corr.test <- ggplot.corr(DANA.test$data.model$pos$corr,
clusters=clusters,
title="mixed-batch (un-normalized)")
p.corr.TMM <- ggplot.corr(DANA.test$norm.models$test.TMM$pos$corr,
clusters=clusters,
title="TMM")
p.corr.DESeq <- ggplot.corr(DANA.test$norm.models$test.DESeq$pos$corr,
clusters=clusters,
title="DESeq")
p.corr.PoissonSeq <- ggplot.corr(DANA.test$norm.models$test.PoissonSeq$pos$corr,
clusters=clusters,
title="PoissonSeq")
p.corr.TC <- ggplot.corr(DANA.test$norm.models$test.TC$pos$corr,
clusters=clusters,
title="TC")
p.corr.Med <- ggplot.corr(DANA.test$norm.models$test.Med$pos$corr,
clusters=clusters,
title="Med")
p.corr.RUVg <- ggplot.corr(DANA.test$norm.models$test.RUVg$pos$corr,
clusters=clusters,
title="RUVg")
p.corr.RUVs <- ggplot.corr(DANA.test$norm.models$test.RUVs$pos$corr,
clusters=clusters,
title="RUVs")
p.corr.RUVr <- ggplot.corr(DANA.test$norm.models$test.RUVr$pos$corr,
clusters=clusters,
title="RUVr")
p.corr.UQ <- ggplot.corr(DANA.test$norm.models$test.UQ$pos$corr,
clusters=clusters,
title="UQ")
p.corr.QN <- ggplot.corr(DANA.test$norm.models$test.QN$pos$corr,
clusters=clusters,
title="QN")
# single-batch data
DANA.bench.plot <- DANA(data.RC=bench.RC,
data.norm=bench.norm,
pos.controls=pos.controls.test,
neg.controls=neg.controls.bench,
clusters=clusters.test,
coords=coords,
case.name="benchmark",
generate.plots=config$generate.plots,
preprocess.transform=config$preprocess.transform,
corr.method.pos=config$corr.method.pos,
tuning.criterion.pos=config$tuning.criterion.pos,
corr.method.neg=config$corr.method.neg,
tuning.criterion.neg=config$tuning.criterion.neg)
p.corr.bench <- ggplot.corr(DANA.bench.plot$data.model$pos$corr,
clusters=clusters,
title="single-batch (un-normalized)")
# Arrange plots
p.corr.test.all <-
plot_grid(p.corr.bench + theme(legend.position="none"),
p.corr.test + theme(legend.position="none"),
p.corr.TMM + theme(legend.position="none"),
p.corr.DESeq + theme(legend.position="none"),
p.corr.PoissonSeq + theme(legend.position="none"),
p.corr.QN + theme(legend.position="none"),
p.corr.RUVg + theme(legend.position="none"),
p.corr.RUVs + theme(legend.position="none"),
p.corr.RUVr + theme(legend.position="none"),
p.corr.Med + theme(legend.position="none"),
p.corr.TC + theme(legend.position="none"),
p.corr.UQ + theme(legend.position="none"),
ncol=3)
plot(p.corr.test.all)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_Test_Corr_All.pdf"), p.corr.test.all, width=9, height=12, device="pdf")
}
}
## Estimating correlations for pos. and neg. controls for data set benchmark...done
## Estimating correlations for pos. and neg. controls for data set benchmark.TMM...done
## Estimating correlations for pos. and neg. controls for data set benchmark.DESeq...done
## Estimating correlations for pos. and neg. controls for data set benchmark.PoissonSeq...done
## Estimating correlations for pos. and neg. controls for data set benchmark.TC...done
## Estimating correlations for pos. and neg. controls for data set benchmark.Med...done
## Estimating correlations for pos. and neg. controls for data set benchmark.RUVg...done
## Estimating correlations for pos. and neg. controls for data set benchmark.RUVs...done
## Estimating correlations for pos. and neg. controls for data set benchmark.RUVr...done
## Estimating correlations for pos. and neg. controls for data set benchmark.UQ...done
## Estimating correlations for pos. and neg. controls for data set benchmark.QN...done
## Comparing models benchmark vs. benchmark.TMM....done
## Comparing models benchmark vs. benchmark.DESeq....done
## Comparing models benchmark vs. benchmark.PoissonSeq....done
## Comparing models benchmark vs. benchmark.TC....done
## Comparing models benchmark vs. benchmark.Med....done
## Comparing models benchmark vs. benchmark.RUVg....done
## Comparing models benchmark vs. benchmark.RUVs....done
## Comparing models benchmark vs. benchmark.RUVr....done
## Comparing models benchmark vs. benchmark.UQ....done
## Comparing models benchmark vs. benchmark.QN....done
if(settings$generate.paper.figs) {
models <- list(
"Single-batch" = DANA.bench$data.model,
"Mixed-batch" = DANA.test$data.model,
"Mixed-batch (TMM)" = DANA.test$norm.models$test.TMM,
"Mixed-batch (DESeq)" = DANA.test$norm.models$test.DESeq,
"Mixed-batch (PoissonSeq)" = DANA.test$norm.models$test.PoissonSeq,
"Mixed-batch (RUVg)" = DANA.test$norm.models$test.RUVg,
"Mixed-batch (RUVs)" = DANA.test$norm.models$test.RUVs,
"Mixed-batch (RUVr)" = DANA.test$norm.models$test.RUVr,
"Mixed-batch (TC)" = DANA.test$norm.models$test.TC,
"Mixed-batch (Med)" = DANA.test$norm.models$test.Med,
"Mixed-batch (UQ)" = DANA.test$norm.models$test.UQ,
"Mixed-batch (QN)" = DANA.test$norm.models$test.QN
)
# Set number of genes for negative correlation to minimum found
num.genes.plot <- min(sapply(models, function(x) dim(x$neg$corr)[1]))
# Subsetted correlation matrices
corrs <- lapply(models, function(x) x$neg$corr[1:num.genes.plot, 1:num.genes.plot])
corrs <- lapply(corrs, function(x) x[upper.tri(x)])
control <- factor(
as.vector(sapply(names(corrs), function(x) rep(x,length(corrs[[1]])))),
levels = names(models)
)
neg.corr <- data.frame(
control=factor(control),
corr=unname(unlist(corrs))
)
# plot insert
p.insert <- ggplot(neg.corr, aes(x=corr, colour=control, linetype=control)) +
geom_vline(xintercept = 0, size=.4, color="gray80", linetype="solid") +
geom_freqpoly(binwidth=.05, alpha=.9, size=.3) +
theme_bw() +
xlim(-1, 1) +
scale_color_manual(values=brewer.pal(n=12, name="Paired")) +
theme(legend.position="none",
axis.title.x = element_blank(),
axis.title.y = element_blank())
p.corr.frequency.neg.controls.all <- ggplot(neg.corr, aes(x=corr, colour=control, linetype=control)) +
geom_vline(xintercept = 0, size=.4, color="gray80", linetype="solid") +
geom_freqpoly(binwidth=.05, alpha=.9, size=.75) +
theme_classic() +
xlim(-1, 1) +
ylim(0,200) +
scale_color_manual(values=brewer.pal(n=12, name="Paired")) +
theme(legend.position=c(0.84,0.8),
legend.title=element_blank(),
legend.background=element_rect(fill=alpha('white', 0.5))) +
ylab("Frequency") +
xlab("Marginal correlation") +
theme(legend.key.width = unit(2.4,"cm")) +
annotation_custom(ggplotGrob(p.insert), xmin=-1.1, xmax=-0.3, ymin=100, ymax=200)
print(p.corr.frequency.neg.controls.all)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_CorrDensityNeg_Freq_All.pdf"), p.corr.frequency.neg.controls.all, width = 8, height=8, device="pdf")
}
}
## Warning: Removed 24 row(s) containing missing values (geom_path).
## Warning: Removed 24 row(s) containing missing values (geom_path).
## Warning: Removed 24 row(s) containing missing values (geom_path).
if(settings$generate.paper.figs) {
par(mfrow=c(1,1))
# un-normalized vs TMM
p.scatter.test.TMM <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.TMM$pos$corr,
clusters=clusters.test, title="TMM", xlab="un-normalized", ylab="TMM",
ccc=round(DANA.test$metrics["test.TMM", "cc"],3))
# un-normalized vs DESeq
p.scatter.test.DESeq <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.DESeq$pos$corr,
clusters=clusters.test, title="DESeq", xlab="un-normalized", ylab="DESeq",
ccc=round(DANA.test$metrics["test.DESeq", "cc"],3))
# un-normalized vs PoissonSeq
p.scatter.test.PoissonSeq <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.PoissonSeq$pos$corr,
clusters=clusters.test, title="PoissonSeq", xlab="un-normalized", ylab="PoissonSeq",
ccc=round(DANA.test$metrics["test.PoissonSeq", "cc"],3))
# un-normalized vs TC
p.scatter.test.TC <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.TC$pos$corr,
clusters=clusters.test, title="TC", xlab="un-normalized", ylab="TC",
ccc=round(DANA.test$metrics["test.TC", "cc"],3))
# un-normalized vs Med
p.scatter.test.Med <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.Med$pos$corr,
clusters=clusters.test, title="Med", xlab="un-normalized", ylab="Med",
ccc=round(DANA.test$metrics["test.Med", "cc"],3))
# un-normalized vs RUVg
p.scatter.test.RUVg <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.RUVg$pos$corr,
clusters=clusters.test, title="RUVg", xlab="un-normalized", ylab="RUVg",
ccc=round(DANA.test$metrics["test.RUVg", "cc"],3))
# un-normalized vs RUVs
p.scatter.test.RUVs <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.RUVs$pos$corr,
clusters=clusters.test, title="RUVs", xlab="un-normalized", ylab="RUVs",
ccc=round(DANA.test$metrics["test.RUVs", "cc"],3))
# un-normalized vs RUVr
p.scatter.test.RUVr <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.RUVr$pos$corr,
clusters=clusters.test, title="RUVr", xlab="un-normalized", ylab="RUVr",
ccc=round(DANA.test$metrics["test.RUVr", "cc"],3))
# un-normalized vs UQ
p.scatter.test.UQ <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.UQ$pos$corr,
clusters=clusters.test, title="UQ", xlab="un-normalized", ylab="UQ",
ccc=round(DANA.test$metrics["test.UQ", "cc"],3))
# un-normalized vs QN
p.scatter.test.QN <- plot.corr.scatter(
corr1=DANA.test$data.model$pos$corr,
corr2=DANA.test$norm.models$test.QN$pos$corr,
clusters=clusters.test, title="QN", xlab="un-normalized", ylab="QN",
ccc=round(DANA.test$metrics["test.QN", "cc"],3))
p.scatter.test.all <- plot_grid(p.scatter.test.TMM,
p.scatter.test.DESeq,
p.scatter.test.PoissonSeq,
p.scatter.test.TC,
p.scatter.test.Med,
p.scatter.test.RUVg,
p.scatter.test.RUVs,
p.scatter.test.RUVr,
p.scatter.test.UQ,
p.scatter.test.QN,
ncol = 3,
align="hv")
plot(p.scatter.test.all)
# Save plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_Scatter_All.pdf"), p.scatter.test.all, width=9, height=12, device="pdf")
}
}
## Warning in plot.corr.scatter(corr1 = DANA.test$data.model$pos$corr, corr2 = DANA.test$norm.models$test.RUVs$pos$corr, : Warning: Shapes of correlation matrices in plot.corr.scatter do not match!
## Reducing correlation matrices to all mutual genes.
## Warning: Removed 2 rows containing missing values (geom_point).
if(settings$generate.paper.figs) {
options(scipen=4) # force non-scientific notation of x axis
test.DANA.metrics <- data.frame(
method=substr(rownames(DANA.test$metrics), 6, 20),
cc=DANA.test$metrics[, "cc"],
mscr=DANA.test$metrics[, "mscr"]
)
p.metrics.test <- ggplot(test.DANA.metrics,aes(x=mscr,y=cc, label=method)) +
geom_point(alpha=.5) + #pos=position_jitter(width=0.001, seed=1),
theme_classic() +
xlab(TeX("$mscr^-$; Relative reduction of handling effects")) +
ylab(TeX("$cc^+$; Biological signal preservation")) +
geom_text_repel(aes(label = method), size=3, max.overlaps = Inf, min.segment.length = 0.2, force = 1, box.padding = 0.2) +
# xlim(c(min(c(0, test.mposc.reduction)),max(test.mposc.reduction))) +
scale_x_continuous(labels = scales::percent, limits=c(0,1.05),
breaks = scales::pretty_breaks(n = 5)) +
# ylim(c(0,1))
scale_y_continuous(labels = scales::percent, limits = c(0.5,1))
print(p.metrics.test)
print("DANA Statistics for mixed-batch data")
test.DANA.metrics$cc <- round(test.DANA.metrics$cc,3)
test.DANA.metrics$mscr <- round(test.DANA.metrics$mscr,3)
print(test.DANA.metrics)
bench.DANA.metrics <- data.frame(
method=substr(rownames(DANA.bench$metrics), 11, 30),
cc=DANA.bench$metrics[, "cc"],
mscr=DANA.bench$metrics[, "mscr"]
)
p.metrics.bench <- ggplot(bench.DANA.metrics, aes(x=mscr, y=cc, label=method)) +
geom_point(alpha=.5) +
theme_classic() +
xlab(TeX("$mscr^-$; Relative reduction of handling effects")) +
ylab(TeX("$cc^+$; Biological signal preservation")) +
geom_text_repel(aes(label = method), size=3) +
scale_x_continuous(labels = scales::percent, limits=c(0, 1.05),
breaks = scales::pretty_breaks(n = 5)) +
scale_y_continuous(labels = scales::percent, limits = c(0.5, 1))
print(p.metrics.bench)
print("DANA Statistics for single-batch data")
bench.DANA.metrics$cc <- round(bench.DANA.metrics$cc,3)
bench.DANA.metrics$mscr <- round(bench.DANA.metrics$mscr,3)
print(bench.DANA.metrics)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_MetricsTest.pdf"), p.metrics.test, width=5.5, height=4.5, device="pdf")
ggsave(paste0(settings$paper.fig.path, "UCEC_MetricsBench.pdf"), p.metrics.bench, width=4.5, height=3.5, device="pdf")
}
}
## [1] "DANA Statistics for mixed-batch data"
## method cc mscr
## 1 TMM 0.985 0.474
## 2 DESeq 0.988 0.434
## 3 PoissonSeq 0.987 0.476
## 4 TC 0.980 0.073
## 5 Med 0.988 0.276
## 6 RUVg 0.980 0.422
## 7 RUVs 0.866 0.441
## 8 RUVr 0.970 0.486
## 9 UQ 0.993 0.246
## 10 QN 0.946 0.483
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
## [1] "DANA Statistics for single-batch data"
## method cc mscr
## 1 TMM 0.984 0.115
## 2 DESeq 0.994 0.107
## 3 PoissonSeq 0.994 0.150
## 4 TC 0.982 -0.269
## 5 Med 0.914 0.168
## 6 RUVg 0.983 0.056
## 7 RUVs 0.941 0.090
## 8 RUVr 0.970 0.197
## 9 UQ 0.924 0.128
## 10 QN 0.933 0.156
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
if(settings$generate.paper.figs && settings$perform.DEA) {
# FNR-FDR Plot
test.DEA.res <- data.frame(method=substr(names(test.norm.DEA.stats), 6, 20),
FDR=-sapply(test.norm.DEA.stats, function(x) x$FDR)+1,
FNR=-sapply(test.norm.DEA.stats, function(x) x$FNR)+1)
p.test.DEA.FNR.FDR <- ggplot(test.DEA.res, aes(x=FNR, y=FDR, label=method)) +
theme_classic() +
geom_point(alpha=.5) +
xlab("Sensitivity") + # True positive rate(1-FNR)
ylab("Positive predictive value") + # Positive predictive value (1-FDR)
geom_text_repel(aes(label = method), size=3, max.overlaps = Inf) +
scale_x_continuous(labels = scales::percent, limits=c(0,1),
breaks = scales::pretty_breaks(n = 4)) +
scale_y_continuous(labels = scales::percent, limits = c(0,1))
print(p.test.DEA.FNR.FDR)
# CAT Plot
d <- append(list("test.No Norm"=test.DEA), test.norm.DEA)
names(d) <- substr(names(d), 6, 20)
p.test.DEA.CAT.1 <- plot.CAT(DEA=d[1:7],
truth.DEA=bench.DEA,
title="Significance ranks of miRNAs",
maxrank=100)
print(p.test.DEA.CAT.1)
p.test.DEA.CAT.2 <- plot.CAT(DEA=d[c(1,8:11)],
truth.DEA=bench.DEA,
title="Significance ranks of miRNAs",
maxrank=100)
print(p.test.DEA.CAT.2)
rm(d)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_DEA_FNR_FDR.pdf"), p.test.DEA.FNR.FDR, width=5.5, height=4.5, device="pdf")
ggsave(paste0(settings$paper.fig.path, "UCEC_DEA_CAT.pdf"), p.test.DEA.CAT, width=4.5, height=3.5, device="pdf")
}
}
if(settings$generate.paper.figs && settings$perform.DEA) {
p.results.paper <-
plot_grid(p.RC.hist,
p.meanvar.test + labs(title = NULL),
p.metrics.test,
p.test.DEA.FNR.FDR,
align="hv",
labels=c("a", "b", "c", "d"))
plot(p.results.paper)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "UCEC_Results.pdf"), p.results.paper, width=10, height=8.5, device="pdf")
}
}
## Warning: Removed 1 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_bar).
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
##
## attached base packages:
## [1] splines stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggcorrplot_0.1.3 latex2exp_0.5.0
## [3] RColorBrewer_1.1-2 cowplot_1.1.1
## [5] openxlsx_4.2.4 ffpe_1.38.0
## [7] TTR_0.24.2 DescTools_0.99.44
## [9] vsn_3.62.0 RUVSeq_1.28.0
## [11] EDASeq_2.28.0 ShortRead_1.52.0
## [13] GenomicAlignments_1.30.0 Rsamtools_2.10.0
## [15] Biostrings_2.62.0 XVector_0.34.0
## [17] sva_3.42.0 BiocParallel_1.28.2
## [19] genefilter_1.76.0 mgcv_1.8-38
## [21] nlme_3.1-153 PoissonSeq_1.1.2
## [23] combinat_0.0-8 DESeq2_1.34.0
## [25] SummarizedExperiment_1.24.0 Biobase_2.54.0
## [27] MatrixGenerics_1.6.0 matrixStats_0.61.0
## [29] GenomicRanges_1.46.1 GenomeInfoDb_1.30.0
## [31] IRanges_2.28.0 S4Vectors_0.32.3
## [33] BiocGenerics_0.40.0 edgeR_3.36.0
## [35] limma_3.50.0 FastGGM_1.0
## [37] RcppParallel_5.1.4 Rcpp_1.0.7
## [39] huge_1.3.5 glmnet_4.1-3
## [41] Matrix_1.3-4 ggrepel_0.9.1
## [43] plotly_4.10.0 stargazer_5.2.2
## [45] corrplot_0.92 ggnewscale_0.4.5
## [47] gridExtra_2.3 ggplot2_3.3.5
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 R.utils_2.11.0
## [3] tidyselect_1.1.1 RSQLite_2.2.9
## [5] AnnotationDbi_1.56.2 htmlwidgets_1.5.4
## [7] grid_4.1.2 munsell_0.5.0
## [9] codetools_0.2-18 preprocessCore_1.56.0
## [11] nleqslv_3.3.2 withr_2.4.3
## [13] colorspace_2.0-2 filelock_1.0.2
## [15] highr_0.9 knitr_1.36
## [17] rstudioapi_0.13 labeling_0.4.2
## [19] GenomeInfoDbData_1.2.7 hwriter_1.3.2
## [21] farver_2.1.0 bit64_4.0.5
## [23] rhdf5_2.38.0 vctrs_0.3.8
## [25] generics_0.1.1 xfun_0.28
## [27] BiocFileCache_2.2.0 R6_2.5.1
## [29] illuminaio_0.36.0 locfit_1.5-9.4
## [31] reshape_0.8.8 bitops_1.0-7
## [33] rhdf5filters_1.6.0 cachem_1.0.6
## [35] DelayedArray_0.20.0 assertthat_0.2.1
## [37] BiocIO_1.4.0 scales_1.1.1
## [39] rootSolve_1.8.2.3 gtable_0.3.0
## [41] affy_1.72.0 methylumi_2.40.1
## [43] lmom_2.8 rlang_0.4.12
## [45] rtracklayer_1.54.0 lazyeval_0.2.2
## [47] GEOquery_2.62.1 BiocManager_1.30.16
## [49] yaml_2.2.1 crosstalk_1.2.0
## [51] GenomicFeatures_1.46.1 tools_4.1.2
## [53] nor1mix_1.3-0 affyio_1.64.0
## [55] ellipsis_0.3.2 lumi_2.46.0
## [57] jquerylib_0.1.4 siggenes_1.68.0
## [59] proxy_0.4-26 plyr_1.8.6
## [61] sparseMatrixStats_1.6.0 progress_1.2.2
## [63] zlibbioc_1.40.0 purrr_0.3.4
## [65] RCurl_1.98-1.5 prettyunits_1.1.1
## [67] openssl_1.4.5 bumphunter_1.36.0
## [69] zoo_1.8-9 sfsmisc_1.1-12
## [71] magrittr_2.0.1 data.table_1.14.2
## [73] mvtnorm_1.1-3 aroma.light_3.24.0
## [75] hms_1.1.1 evaluate_0.14
## [77] xtable_1.8-4 XML_3.99-0.8
## [79] jpeg_0.1-9 mclust_5.4.8
## [81] shape_1.4.6 compiler_4.1.2
## [83] biomaRt_2.50.1 minfi_1.40.0
## [85] tibble_3.1.6 KernSmooth_2.23-20
## [87] crayon_1.4.2 R.oo_1.24.0
## [89] htmltools_0.5.2 tzdb_0.2.0
## [91] tidyr_1.1.4 geneplotter_1.72.0
## [93] expm_0.999-6 Exact_3.1
## [95] DBI_1.1.1 dbplyr_2.1.1
## [97] MASS_7.3-54 rappdirs_0.3.3
## [99] boot_1.3-28 readr_2.1.1
## [101] quadprog_1.5-8 R.methodsS3_1.8.1
## [103] parallel_4.1.2 igraph_1.2.9
## [105] pkgconfig_2.0.3 xml2_1.3.3
## [107] foreach_1.5.1 annotate_1.72.0
## [109] rngtools_1.5.2 multtest_2.50.0
## [111] beanplot_1.2 doRNG_1.8.2
## [113] scrime_1.3.5 stringr_1.4.0
## [115] digest_0.6.29 base64_2.0
## [117] rmarkdown_2.11 gld_2.6.3
## [119] DelayedMatrixStats_1.16.0 restfulr_0.0.13
## [121] curl_4.3.2 rjson_0.2.20
## [123] lifecycle_1.0.1 jsonlite_1.7.2
## [125] Rhdf5lib_1.16.0 askpass_1.1
## [127] viridisLite_0.4.0 fansi_0.5.0
## [129] pillar_1.6.4 lattice_0.20-45
## [131] KEGGREST_1.34.0 fastmap_1.1.0
## [133] httr_1.4.2 survival_3.2-13
## [135] glue_1.5.1 xts_0.12.1
## [137] zip_2.2.0 png_0.1-7
## [139] iterators_1.0.13 bit_4.0.4
## [141] class_7.3-19 stringi_1.7.6
## [143] HDF5Array_1.22.1 blob_1.2.2
## [145] latticeExtra_0.6-29 memoise_2.0.1
## [147] dplyr_1.0.7 e1071_1.7-9